This lab draws on examples in
Two of the packages are available through Bioconductor
library(tidyverse)
library(knitr)
library(ggtree)
library(TDbook) #A Companion Package for the Book "Data Integration, Manipulation and Visualization of Phylogenetic Trees" by Guangchuang Yu (2022, ISBN:9781032233574).
library(ggimage)
library(rphylopic)
library(treeio)
library(tidytree)
library(ape)
library(TreeTools)
library(phytools)
library(ggnewscale)
library(ggtreeExtra)
library(ggstar)Let’s load the MAG table with the new GTDB annotations with the updated taxonomy database
NEON_MAGs <- read_csv("data/NEON/GOLD_Study_ID_Gs0161344_NEON_2024_4_21.csv") %>%
# remove columns that are not needed for data analysis
select(-c(`GOLD Study ID`, `Bin Methods`, `Created By`, `Date Added`, `Bin Lineage`)) %>%
# create a new column with the Assembly Type
mutate("Assembly Type" = case_when(`Genome Name` == "NEON combined assembly" ~ `Genome Name`,
TRUE ~ "Individual")) %>%
mutate_at("Assembly Type", str_replace, "NEON combined assembly", "Combined") %>%
mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "d__", "") %>%
mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "p__", "") %>%
mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "c__", "") %>%
mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "o__", "") %>%
mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "f__", "") %>%
mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "g__", "") %>%
mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "s__", "") %>%
separate(`GTDB-Tk Taxonomy Lineage`, c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species"), ";", remove = FALSE) %>%
mutate_at("Domain", na_if,"") %>%
mutate_at("Phylum", na_if,"") %>%
mutate_at("Class", na_if,"") %>%
mutate_at("Order", na_if,"") %>%
mutate_at("Family", na_if,"") %>%
mutate_at("Genus", na_if,"") %>%
mutate_at("Species", na_if,"") %>%
# Get rid of the the common string "Soil microbial communities from "
mutate_at("Genome Name", str_replace, "Terrestrial soil microbial communities from ", "") %>%
# Use the first `-` to split the column in two
separate(`Genome Name`, c("Site","Sample Name"), " - ") %>%
# Get rid of the the common string "S-comp-1"
mutate_at("Sample Name", str_replace, "-comp-1", "") %>%
# separate the Sample Name into Site ID and plot info
separate(`Sample Name`, c("Site ID","subplot.layer.date"), "_", remove = FALSE,) %>%
# separate the plot info into 3 columns
separate(`subplot.layer.date`, c("Subplot", "Layer", "Date"), "-") We are going to remove the re-annotation and
WREF plot samples from the data set you export last week
from IMG.
NEON_metagenomes <- read_tsv("data/NEON/exported_img_data_Gs0161344_NEON.tsv") %>%
select(-c(`Domain`, `Sequencing Status`, `Sequencing Center`)) %>%
rename(`Genome Name` = `Genome Name / Sample Name`) %>%
filter(str_detect(`Genome Name`, 're-annotation', negate = T)) %>%
filter(str_detect(`Genome Name`, 'WREF plot', negate = T)) Now let’s reformat Genome Name as we did for the above
MAG table
NEON_metagenomes <- NEON_metagenomes %>%
# Get rid of the the common string "Soil microbial communities from "
mutate_at("Genome Name", str_replace, "Terrestrial soil microbial communities from ", "") %>%
# Use the first `-` to split the column in two
separate(`Genome Name`, c("Site","Sample Name"), " - ") %>%
# Get rid of the the common string "-comp-1"
mutate_at("Sample Name", str_replace, "-comp-1", "") %>%
# separate the Sample Name into Site ID and plot info
separate(`Sample Name`, c("Site ID","subplot.layer.date"), "_", remove = FALSE,) %>%
# separate the plot info into 3 columns
separate(`subplot.layer.date`, c("Subplot", "Layer", "Date"), "-") ## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 1 rows [53].
You first need to find the internal node on the tree corresponding to your Domain, Phylum or Class. Below is an expample
# Make a vector with the internal node labels
node_vector_bac = c(tree_bac$tip.label,tree_bac$node.label)
# Search for your Phylum or Class to get the node
grep("Chloroflexota", node_vector_bac, value = TRUE)## [1] "'0.996:p__Chloroflexota'"
## [1] 1712
ggtree(tree_Chloroflexota, layout="circular") +
geom_tiplab(aes(angle=angle))+
theme_tree() +
xlim(0,20)
Circular with taxonomic group highlighted
### Circular no branch lengths taxonomic group labeled
ggtree(tree_bac, layout="circular", branch.length="none") +
geom_hilight(node=1712, fill="steelblue", alpha=.6) +
geom_cladelab(node=1712, label="Chloroflexota", align=TRUE,
offset = 0, textcolor='steelblue', barcolor='steelblue')
### Circular no branch lengths taxonomic group collapsed
ggtree(tree_bac, layout="circular", branch.length="none") %>%
collapse(node=1712) +
geom_point2(aes(subset=(node==55)), shape=23, size=5, fill='steelblue') +
geom_cladelab(node=55, label="Chloroflexota", align=TRUE,
offset = 2, textcolor='steelblue')
### Circular no branch lengths taxonomic group collapsed as triangle and
labeled
p <- ggtree(tree_bac, layout="circular", branch.length="none")
scaleClade(p, 1712, .2) %>% collapse(1712, 'min', fill="steelblue") +
geom_cladelab(node=1712, label="Chloroflexota", align=TRUE,
offset = 0.2, textcolor='steelblue')Note the special symbol %<+%. This is for joining the
tree object with your data
ggtree(tree_Chloroflexota, layout="circular") %<+%
NEON_MAGs_metagenomes_chemistry +
geom_tiplab(size=2, hjust=-.1) +
xlim(0,20) +
geom_point(mapping=aes(color=Class)) ggtree(tree_Chloroflexota, layout="circular") %<+%
NEON_MAGs_metagenomes_chemistry +
geom_tiplab(size=2, hjust=-.1) +
xlim(0,20) +
geom_point(mapping=aes(color=Class, shape = `Assembly Type`)) ## Warning: Removed 46 rows containing missing values or values outside the scale range
## (`geom_point()`).
ggtree(tree_Chloroflexota) %<+%
NEON_MAGs_metagenomes_chemistry +
geom_tiplab(size=2, hjust=-.1) +
xlim(0,20) +
geom_point(mapping=aes(color=`Ecosystem Subtype`)) ggtree(tree_Chloroflexota) %<+%
NEON_MAGs_metagenomes_chemistry +
geom_tippoint(aes(colour=`Bin Completeness`)) +
scale_colour_gradient(low='blue', high='red') +
geom_tiplab(size=2, hjust=-.1) +
xlim(0,15) ggtree(tree_Chloroflexota, layout="circular") %<+%
NEON_MAGs_metagenomes_chemistry +
geom_point2(mapping=aes(color=`Ecosystem Subtype`, size=`Total Number of Bases`))## Warning: Removed 46 rows containing missing values or values outside the scale range
## (`geom_point_g_gtree()`).
# For unknown reasons the following does not like blank spaces in the names
NEON_MAGs_metagenomes_chemistry_noblank <- NEON_MAGs_metagenomes_chemistry %>%
rename("AssemblyType" = "Assembly Type") %>%
rename("BinCompleteness" = "Bin Completeness") %>%
rename("BinContamination" = "Bin Contamination") %>%
rename("TotalNumberofBases" = "Total Number of Bases") %>%
rename("EcosystemSubtype" = "Ecosystem Subtype")
ggtree(tree_Chloroflexota) %<+%
NEON_MAGs_metagenomes_chemistry +
geom_tippoint(aes(colour=`Ecosystem Subtype`)) +
# For unknown reasons the following does not like blank spaces in the names
geom_facet(panel = "Bin Completeness", data = NEON_MAGs_metagenomes_chemistry_noblank, geom = geom_point,
mapping=aes(x = BinCompleteness)) +
geom_facet(panel = "Bin Contamination", data = NEON_MAGs_metagenomes_chemistry_noblank, geom = geom_col,
aes(x = BinContamination), orientation = 'y', width = .6) +
theme_tree2(legend.position=c(.1, .7))ggtree(tree_Chloroflexota, layout="circular", branch.length="none") %<+%
NEON_MAGs_metagenomes_chemistry +
geom_point2(mapping=aes(color=`Ecosystem Subtype`, size=`Total Number of Bases`)) +
new_scale_fill() +
geom_fruit(
data=NEON_MAGs_metagenomes_chemistry_noblank,
geom=geom_tile,
mapping=aes(y=label, x=1, fill= AssemblyType),
offset=0.08, # The distance between external layers, default is 0.03 times of x range of tree.
pwidth=0.25 # width of the external layer, default is 0.2 times of x range of tree.
) ## ! The following column names/name: Site.x, Sample Name, Site ID.x, Subplot.x, Layer.x, Date.x, IMG Genome ID.x, Bin Quality, GTDB-Tk Taxonomy Lineage, Domain, Phylum, Class, Order, Family, Genus, Species, 5s rRNA, 16s rRNA, 23s rRNA, tRNA Genes, Gene Count, Scaffold Count, taxon_oid, Study Name, Site.y, Site ID.y, Subplot.y, Layer.y, Date.y, IMG Genome ID.y, GOLD Study ID, Ecosystem, Ecosystem Category, Ecosystem Type, Specific Ecosystem, Altitude In Meters, Chlorophyll Concentration, Depth In Meters, Elevation In Meters, Geographic Location, Habitat, Isolation, Isolation Country, Latitude, Longhurst Code, Longhurst Description, Longitude, Nitrate Concentration, Oxygen Concentration, pH, Pressure, Salinity, Salinity Concentration, Sample Collection Date, Sample Collection Temperature, Subsurface In Meters, Genome Size * assembled, Gene Count * assembled, Scaffold Count * assembled, Genome MetaBAT Bin Count * assembled, Genome EukCC Bin Count * assembled, CRISPR Count * assembled, GC Count * assembled, GC * assembled, Coding Base Count * assembled, Coding Base Count % * assembled, CDS Count * assembled, CDS % * assembled, siteID, plotID, nlcdClass, decimalLatitude, decimalLongitude, elevation, collectionDate, horizon, soilTemp, d15N, organicd13C, nitrogenPercent, organicCPercent, CNratio, soilInWaterpH, soilInCaClpH are/is the same to tree data, the tree data column names are : label, y, angle, Site.x, Sample Name, Site ID.x, Subplot.x, Layer.x, Date.x, IMG Genome ID.x, Bin Quality, GTDB-Tk Taxonomy Lineage, Domain, Phylum, Class, Order, Family, Genus, Species, Bin Completeness, Bin Contamination, Total Number of Bases, 5s rRNA, 16s rRNA, 23s rRNA, tRNA Genes, Gene Count, Scaffold Count, Assembly Type, taxon_oid, Study Name, Site.y, Site ID.y, Subplot.y, Layer.y, Date.y, IMG Genome ID.y, GOLD Study ID, Ecosystem, Ecosystem Category, Ecosystem Subtype, Ecosystem Type, Specific Ecosystem, Altitude In Meters, Chlorophyll Concentration, Depth In Meters, Elevation In Meters, Geographic Location, Habitat, Isolation, Isolation Country, Latitude, Longhurst Code, Longhurst Description, Longitude, Nitrate Concentration, Oxygen Concentration, pH, Pressure, Salinity, Salinity Concentration, Sample Collection Date, Sample Collection Temperature, Subsurface In Meters, Genome Size * assembled, Gene Count * assembled, Scaffold Count * assembled, Genome MetaBAT Bin Count * assembled, Genome EukCC Bin Count * assembled, CRISPR Count * assembled, GC Count * assembled, GC * assembled, Coding Base Count * assembled, Coding Base Count % * assembled, CDS Count * assembled, CDS % * assembled, siteID, plotID, nlcdClass, decimalLatitude, decimalLongitude, elevation, collectionDate, horizon, soilTemp, d15N, organicd13C, nitrogenPercent, organicCPercent, CNratio, soilInWaterpH, soilInCaClpH.
## Warning: Removed 46 rows containing missing values or values outside the scale range
## (`geom_point_g_gtree()`).
### Adding multiple rings to a circular tree
ggtree(tree_Chloroflexota, layout="circular", branch.length="none") %<+%
NEON_MAGs_metagenomes_chemistry +
geom_point2(mapping=aes(color=`Ecosystem Subtype`, size=`Total Number of Bases`)) +
new_scale_fill() +
geom_fruit(
data=NEON_MAGs_metagenomes_chemistry_noblank,
geom=geom_tile,
mapping=aes(y=label, x=1, fill= AssemblyType),
offset=0.08, # The distance between external layers, default is 0.03 times of x range of tree.
pwidth=0.25 # width of the external layer, default is 0.2 times of x range of tree.
) +
new_scale_fill() +
geom_fruit(
data=NEON_MAGs_metagenomes_chemistry_noblank,
geom=geom_col,
mapping=aes(y=label, x=TotalNumberofBases),
pwidth=0.4,
axis.params=list(
axis="x", # add axis text of the layer.
text.angle=-45, # the text size of axis.
hjust=0 # adjust the horizontal position of text of axis.
),
grid.params=list() # add the grid line of the external bar plot.
) +
theme(#legend.position=c(0.96, 0.5), # the position of legend.
legend.background=element_rect(fill=NA), # the background of legend.
legend.title=element_text(size=7), # the title size of legend.
legend.text=element_text(size=6), # the text size of legend.
legend.spacing.y = unit(0.02, "cm") # the distance of legends (y orientation).
) ## ! The following column names/name: Site.x, Sample Name, Site ID.x, Subplot.x, Layer.x, Date.x, IMG Genome ID.x, Bin Quality, GTDB-Tk Taxonomy Lineage, Domain, Phylum, Class, Order, Family, Genus, Species, 5s rRNA, 16s rRNA, 23s rRNA, tRNA Genes, Gene Count, Scaffold Count, taxon_oid, Study Name, Site.y, Site ID.y, Subplot.y, Layer.y, Date.y, IMG Genome ID.y, GOLD Study ID, Ecosystem, Ecosystem Category, Ecosystem Type, Specific Ecosystem, Altitude In Meters, Chlorophyll Concentration, Depth In Meters, Elevation In Meters, Geographic Location, Habitat, Isolation, Isolation Country, Latitude, Longhurst Code, Longhurst Description, Longitude, Nitrate Concentration, Oxygen Concentration, pH, Pressure, Salinity, Salinity Concentration, Sample Collection Date, Sample Collection Temperature, Subsurface In Meters, Genome Size * assembled, Gene Count * assembled, Scaffold Count * assembled, Genome MetaBAT Bin Count * assembled, Genome EukCC Bin Count * assembled, CRISPR Count * assembled, GC Count * assembled, GC * assembled, Coding Base Count * assembled, Coding Base Count % * assembled, CDS Count * assembled, CDS % * assembled, siteID, plotID, nlcdClass, decimalLatitude, decimalLongitude, elevation, collectionDate, horizon, soilTemp, d15N, organicd13C, nitrogenPercent, organicCPercent, CNratio, soilInWaterpH, soilInCaClpH are/is the same to tree data, the tree data column names are : label, y, angle, Site.x, Sample Name, Site ID.x, Subplot.x, Layer.x, Date.x, IMG Genome ID.x, Bin Quality, GTDB-Tk Taxonomy Lineage, Domain, Phylum, Class, Order, Family, Genus, Species, Bin Completeness, Bin Contamination, Total Number of Bases, 5s rRNA, 16s rRNA, 23s rRNA, tRNA Genes, Gene Count, Scaffold Count, Assembly Type, taxon_oid, Study Name, Site.y, Site ID.y, Subplot.y, Layer.y, Date.y, IMG Genome ID.y, GOLD Study ID, Ecosystem, Ecosystem Category, Ecosystem Subtype, Ecosystem Type, Specific Ecosystem, Altitude In Meters, Chlorophyll Concentration, Depth In Meters, Elevation In Meters, Geographic Location, Habitat, Isolation, Isolation Country, Latitude, Longhurst Code, Longhurst Description, Longitude, Nitrate Concentration, Oxygen Concentration, pH, Pressure, Salinity, Salinity Concentration, Sample Collection Date, Sample Collection Temperature, Subsurface In Meters, Genome Size * assembled, Gene Count * assembled, Scaffold Count * assembled, Genome MetaBAT Bin Count * assembled, Genome EukCC Bin Count * assembled, CRISPR Count * assembled, GC Count * assembled, GC * assembled, Coding Base Count * assembled, Coding Base Count % * assembled, CDS Count * assembled, CDS % * assembled, siteID, plotID, nlcdClass, decimalLatitude, decimalLongitude, elevation, collectionDate, horizon, soilTemp, d15N, organicd13C, nitrogenPercent, organicCPercent, CNratio, soilInWaterpH, soilInCaClpH.
## ! The following column names/name: Site.x, Sample Name, Site ID.x, Subplot.x, Layer.x, Date.x, IMG Genome ID.x, Bin Quality, GTDB-Tk Taxonomy Lineage, Domain, Phylum, Class, Order, Family, Genus, Species, 5s rRNA, 16s rRNA, 23s rRNA, tRNA Genes, Gene Count, Scaffold Count, taxon_oid, Study Name, Site.y, Site ID.y, Subplot.y, Layer.y, Date.y, IMG Genome ID.y, GOLD Study ID, Ecosystem, Ecosystem Category, Ecosystem Type, Specific Ecosystem, Altitude In Meters, Chlorophyll Concentration, Depth In Meters, Elevation In Meters, Geographic Location, Habitat, Isolation, Isolation Country, Latitude, Longhurst Code, Longhurst Description, Longitude, Nitrate Concentration, Oxygen Concentration, pH, Pressure, Salinity, Salinity Concentration, Sample Collection Date, Sample Collection Temperature, Subsurface In Meters, Genome Size * assembled, Gene Count * assembled, Scaffold Count * assembled, Genome MetaBAT Bin Count * assembled, Genome EukCC Bin Count * assembled, CRISPR Count * assembled, GC Count * assembled, GC * assembled, Coding Base Count * assembled, Coding Base Count % * assembled, CDS Count * assembled, CDS % * assembled, siteID, plotID, nlcdClass, decimalLatitude, decimalLongitude, elevation, collectionDate, horizon, soilTemp, d15N, organicd13C, nitrogenPercent, organicCPercent, CNratio, soilInWaterpH, soilInCaClpH are/is the same to tree data, the tree data column names are : label, y, angle, Site.x, Sample Name, Site ID.x, Subplot.x, Layer.x, Date.x, IMG Genome ID.x, Bin Quality, GTDB-Tk Taxonomy Lineage, Domain, Phylum, Class, Order, Family, Genus, Species, Bin Completeness, Bin Contamination, Total Number of Bases, 5s rRNA, 16s rRNA, 23s rRNA, tRNA Genes, Gene Count, Scaffold Count, Assembly Type, taxon_oid, Study Name, Site.y, Site ID.y, Subplot.y, Layer.y, Date.y, IMG Genome ID.y, GOLD Study ID, Ecosystem, Ecosystem Category, Ecosystem Subtype, Ecosystem Type, Specific Ecosystem, Altitude In Meters, Chlorophyll Concentration, Depth In Meters, Elevation In Meters, Geographic Location, Habitat, Isolation, Isolation Country, Latitude, Longhurst Code, Longhurst Description, Longitude, Nitrate Concentration, Oxygen Concentration, pH, Pressure, Salinity, Salinity Concentration, Sample Collection Date, Sample Collection Temperature, Subsurface In Meters, Genome Size * assembled, Gene Count * assembled, Scaffold Count * assembled, Genome MetaBAT Bin Count * assembled, Genome EukCC Bin Count * assembled, CRISPR Count * assembled, GC Count * assembled, GC * assembled, Coding Base Count * assembled, Coding Base Count % * assembled, CDS Count * assembled, CDS % * assembled, siteID, plotID, nlcdClass, decimalLatitude, decimalLongitude, elevation, collectionDate, horizon, soilTemp, d15N, organicd13C, nitrogenPercent, organicCPercent, CNratio, soilInWaterpH, soilInCaClpH, xmaxtmp.
## Warning: Removed 46 rows containing missing values or values outside the scale range
## (`geom_point_g_gtree()`).